In [24]:
#loading the required libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import plotly.express as px
In [3]:
#loading the data sets by pandas
df_ipl = pd.read_csv('matches.csv')
In [5]:
#checking the top 5 records 
df_ipl.head()
Out[5]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
0 1 2017 Hyderabad 05-04-2017 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong NaN
1 2 2017 Pune 06-04-2017 Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal 0 Rising Pune Supergiant 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi NaN
2 3 2017 Rajkot 07-04-2017 Gujarat Lions Kolkata Knight Riders Kolkata Knight Riders field normal 0 Kolkata Knight Riders 0 10 CA Lynn Saurashtra Cricket Association Stadium Nitin Menon CK Nandan NaN
3 4 2017 Indore 08-04-2017 Rising Pune Supergiant Kings XI Punjab Kings XI Punjab field normal 0 Kings XI Punjab 0 6 GJ Maxwell Holkar Cricket Stadium AK Chaudhary C Shamshuddin NaN
4 5 2017 Bangalore 08-04-2017 Royal Challengers Bangalore Delhi Daredevils Royal Challengers Bangalore bat normal 0 Royal Challengers Bangalore 15 0 KM Jadhav M Chinnaswamy Stadium NaN NaN NaN
In [7]:
#looking the number of rows and columns in the data set
df_ipl.shape
Out[7]:
(756, 18)
In [230]:
#Getting the frequency of most man of the match awards and ploting it 
manofthematch=df_ipl['player_of_match'].value_counts().head(10)
fig=px.bar(manofthematch, y='player_of_match', color='player_of_match')
fig.show(renderer='notebook')
In [105]:
df_ipl.head(2)
Out[105]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
0 1 2017 Hyderabad 05-04-2017 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong NaN
1 2 2017 Pune 06-04-2017 Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal 0 Rising Pune Supergiant 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi NaN
In [231]:
#checking the various types of the results
resultplot=df_ipl['result'].value_counts()
px.bar(y=resultplot.index,x=resultplot.values,color=resultplot.index, color_discrete_sequence=px.colors.qualitative.Plotly)
In [133]:
df_ipl.head(2)
Out[133]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
0 1 2017 Hyderabad 05-04-2017 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong NaN
1 2 2017 Pune 06-04-2017 Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal 0 Rising Pune Supergiant 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi NaN
In [140]:
#checking the null values 
df_ipl.isnull().sum().sort_values(ascending=False)
Out[140]:
umpire3            637
city                 7
winner               4
player_of_match      4
umpire2              2
umpire1              2
venue                0
win_by_wickets       0
win_by_runs          0
id                   0
season               0
result               0
toss_decision        0
toss_winner          0
team2                0
team1                0
date                 0
dl_applied           0
dtype: int64
In [143]:
#checking the persentage to take a deision 
(df_ipl.isnull().mean()*100).round(2).sort_values(ascending = False)
Out[143]:
umpire3            84.26
city                0.93
winner              0.53
player_of_match     0.53
umpire2             0.26
umpire1             0.26
venue               0.00
win_by_wickets      0.00
win_by_runs         0.00
id                  0.00
season              0.00
result              0.00
toss_decision       0.00
toss_winner         0.00
team2               0.00
team1               0.00
date                0.00
dl_applied          0.00
dtype: float64
In [146]:
#droping unwanted columns
df_ipl = df_ipl.drop(columns=['umpire3'])
In [148]:
df_ipl.head(2)
Out[148]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2
0 1 2017 Hyderabad 05-04-2017 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong
1 2 2017 Pune 06-04-2017 Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal 0 Rising Pune Supergiant 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi
In [152]:
#treating missing values 
df_ipl['city'].replace(np.NaN,'Unknown',inplace=True)

df_ipl['winner'].replace(np.NaN,'Unknown',inplace=True)

df_ipl['player_of_match'].replace(np.NaN,'Unknown',inplace=True)
In [154]:
mode_value = df_ipl["umpire2"].mode()[0]
In [156]:
df_ipl['umpire2'].fillna(mode_value,inplace=True)
In [158]:
mode_value1 = df_ipl["umpire1"].mode()[0]
In [159]:
df_ipl['umpire1'].fillna(mode_value1,inplace=True)
In [165]:
df_ipl.isnull().sum().sort_values(ascending=False)
Out[165]:
player_of_match    4
id                 0
dl_applied         0
umpire1            0
venue              0
win_by_wickets     0
win_by_runs        0
winner             0
result             0
season             0
toss_decision      0
toss_winner        0
team2              0
team1              0
date               0
city               0
umpire2            0
dtype: int64
In [167]:
df_ipl[df_ipl['player_of_match'].isnull()]
Out[167]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2
300 301 2011 Delhi 21-05-2011 Delhi Daredevils Pune Warriors Delhi Daredevils bat no result 0 Unknown 0 0 NaN Feroz Shah Kotla SS Hazare RJ Tucker
545 546 2015 Bangalore 29-04-2015 Royal Challengers Bangalore Rajasthan Royals Rajasthan Royals field no result 0 Unknown 0 0 NaN M Chinnaswamy Stadium JD Cloete PG Pathak
570 571 2015 Bangalore 17-05-2015 Delhi Daredevils Royal Challengers Bangalore Royal Challengers Bangalore field no result 0 Unknown 0 0 NaN M Chinnaswamy Stadium HDPK Dharmasena K Srinivasan
744 11340 2019 Bengaluru 30-04-2019 Royal Challengers Bangalore Rajasthan Royals Rajasthan Royals field no result 0 Unknown 0 0 NaN M. Chinnaswamy Stadium Nigel Llong Ulhas Gandhe
In [169]:
#checking the data and datatypes of the columns
df_ipl.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   id               756 non-null    int64   
 1   season           756 non-null    int64   
 2   city             756 non-null    object  
 3   date             756 non-null    object  
 4   team1            756 non-null    object  
 5   team2            756 non-null    object  
 6   toss_winner      756 non-null    object  
 7   toss_decision    756 non-null    object  
 8   result           756 non-null    object  
 9   dl_applied       756 non-null    int64   
 10  winner           756 non-null    object  
 11  win_by_runs      756 non-null    int64   
 12  win_by_wickets   756 non-null    int64   
 13  player_of_match  752 non-null    category
 14  venue            756 non-null    object  
 15  umpire1          756 non-null    object  
 16  umpire2          756 non-null    object  
dtypes: category(1), int64(5), object(11)
memory usage: 106.0+ KB
In [172]:
#changing the data type of the columns
df_ipl['player_of_match']=df_ipl['player_of_match'].astype('object')
In [173]:
df_ipl.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               756 non-null    int64 
 1   season           756 non-null    int64 
 2   city             756 non-null    object
 3   date             756 non-null    object
 4   team1            756 non-null    object
 5   team2            756 non-null    object
 6   toss_winner      756 non-null    object
 7   toss_decision    756 non-null    object
 8   result           756 non-null    object
 9   dl_applied       756 non-null    int64 
 10  winner           756 non-null    object
 11  win_by_runs      756 non-null    int64 
 12  win_by_wickets   756 non-null    int64 
 13  player_of_match  752 non-null    object
 14  venue            756 non-null    object
 15  umpire1          756 non-null    object
 16  umpire2          756 non-null    object
dtypes: int64(5), object(12)
memory usage: 100.5+ KB
In [175]:
#making sure the we have not missed to treat any null values 
df_ipl.isnull().sum().sort_values(ascending=False)
Out[175]:
id                 0
dl_applied         0
umpire1            0
venue              0
player_of_match    0
win_by_wickets     0
win_by_runs        0
winner             0
result             0
season             0
toss_decision      0
toss_winner        0
team2              0
team1              0
date               0
city               0
umpire2            0
dtype: int64
In [176]:
#Checking which team won the toss most times
df_ipl["toss_winner"].value_counts().head()
Out[176]:
Mumbai Indians                 98
Kolkata Knight Riders          92
Chennai Super Kings            89
Royal Challengers Bangalore    81
Kings XI Punjab                81
Name: toss_winner, dtype: int64
In [181]:
#checking which team bat first and won the match with the plot below 
batingfirst = df_ipl[df_ipl['win_by_runs']!=0]
In [232]:
fig = px.histogram(batingfirst['win_by_runs'])
fig.update_layout(title_text='Distribution by Runs')
fig.update_xaxes(title_text='Runs')
fig.update_yaxes(title_text='Match_count')
In [199]:
#Finding out the number of wins each team after batting first
batingfirst["winner"].value_counts()
Out[199]:
Mumbai Indians                 57
Chennai Super Kings            52
Kings XI Punjab                38
Kolkata Knight Riders          36
Royal Challengers Bangalore    35
Sunrisers Hyderabad            30
Rajasthan Royals               27
Delhi Daredevils               25
Deccan Chargers                18
Pune Warriors                   6
Rising Pune Supergiant          5
Delhi Capitals                  3
Kochi Tuskers Kerala            2
Rising Pune Supergiants         2
Gujarat Lions                   1
Name: winner, dtype: int64
In [207]:
#checking the second bating and who wont the match most by wickets with plots 
secondbating = df_ipl[df_ipl['win_by_wickets']!=0]
In [233]:
fig = px.histogram(secondbating['win_by_wickets'])
fig.update_layout(title_text='Distribution by Runs')
fig.update_xaxes(title_text='Wikets')
fig.update_yaxes(title_text='Match_count')
In [218]:
#secondbated and wont the most match
secondbating['winner'].value_counts()
Out[218]:
Kolkata Knight Riders          56
Mumbai Indians                 50
Royal Challengers Bangalore    48
Chennai Super Kings            48
Rajasthan Royals               46
Kings XI Punjab                42
Delhi Daredevils               42
Sunrisers Hyderabad            27
Gujarat Lions                  12
Deccan Chargers                11
Pune Warriors                   6
Delhi Capitals                  6
Rising Pune Supergiant          5
Kochi Tuskers Kerala            4
Rising Pune Supergiants         3
Name: winner, dtype: int64
In [221]:
#looking at the number of the matches played each season
df_ipl['season'].value_counts()
Out[221]:
2013    76
2012    74
2011    73
2010    60
2014    60
2016    60
2018    60
2019    60
2017    59
2015    59
2008    58
2009    57
Name: season, dtype: int64
In [223]:
#looking at the number of the matches played each city
df_ipl['city'].value_counts()
Out[223]:
Mumbai            101
Kolkata            77
Delhi              74
Bangalore          66
Hyderabad          64
Chennai            57
Jaipur             47
Chandigarh         46
Pune               38
Durban             15
Bengaluru          14
Visakhapatnam      13
Ahmedabad          12
Centurion          12
Rajkot             10
Mohali             10
Indore              9
Dharamsala          9
Johannesburg        8
Cape Town           7
Ranchi              7
Abu Dhabi           7
Cuttack             7
Unknown             7
Port Elizabeth      7
Sharjah             6
Raipur              6
Kochi               5
Kanpur              4
Nagpur              3
Kimberley           3
East London         3
Bloemfontein        2
Name: city, dtype: int64
In [226]:
#Checking the number of team those who won the toss also and match also 
np.sum(df_ipl['toss_winner']==df_ipl['winner'])
Out[226]:
393
In [228]:
#As it is proved toss dose not played a main role to win the match
result = 393/756
percentage = result * 100
print(percentage)
51.98412698412699
In [235]:
#importing other data set
deliveries = pd.read_csv("deliveries.csv")
In [238]:
#seeing top 5 records
deliveries.head()
Out[238]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over ... bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs player_dismissed dismissal_kind fielder
0 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 1 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 0 0 NaN NaN NaN
1 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 2 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 0 0 NaN NaN NaN
2 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 3 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 4 0 4 NaN NaN NaN
3 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 4 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 0 0 NaN NaN NaN
4 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 5 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 2 2 NaN NaN NaN

5 rows × 21 columns

In [240]:
#checking the rows and columns
deliveries.shape
Out[240]:
(179078, 21)
In [253]:
#flitering the required details
srh = deliveries[(deliveries['match_id']==1) & (deliveries['inning']==1)]
In [254]:
#after filtering checking the SRH team team score
srh['batsman_runs'].value_counts()
Out[254]:
1    57
0    32
4    17
6     9
2     9
3     1
Name: batsman_runs, dtype: int64
In [255]:
#checking how SRH team bats man got out
srh['dismissal_kind'].value_counts()
Out[255]:
caught    3
bowled    1
Name: dismissal_kind, dtype: int64
In [256]:
#in the same way wt we used above checking the other teams score 
RCB = deliveries[(deliveries['match_id']==1) & (deliveries['inning']==2)]
In [257]:
RCB['batsman_runs'].value_counts()
Out[257]:
0    49
1    44
4    15
6     8
2     7
Name: batsman_runs, dtype: int64
In [258]:
# checking RCB team dismissals
RCB['dismissal_kind'].value_counts()
Out[258]:
caught     6
bowled     2
run out    2
Name: dismissal_kind, dtype: int64